In this notebook. only include the analysis of the whole data after filtering, and the integration part.
#REPO code for CF_seurat analysis
library(dplyr)
library(Seurat)
library(patchwork)
library(data.table)
library(R.utils)
library(ggplot2)
library(patchwork)
set.seed(412)
#load gene matrix
mat<-Seurat::ReadMtx("./CF_data/GSE145360_Sputum.data.processed.mtx",cells="./CF_data/GSE145360_Sputum.processed.barcodes.tsv",
features="./CF_data/GSE145360_Sputum.processed.genes.tsv", feature.column = 1)
#load metadata
meta <- data.frame(fread("https://cells.ucsc.edu/human-sputum/meta.tsv"), row.names=1)
# Initialize the Seurat object with the raw (non-normalized data).
#New seurat object for paper analysis
cf <- CreateSeuratObject(counts = mat, project = "cf", min.cells = 3, min.features = 200,
meta.data=meta)
# 1. QC -------
Idents(cf) <- cf@meta.data$disease.ident
# get percent mitochondrial genes
cf[["percent.mt"]] <- PercentageFeatureSet(cf, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(cf, features = c("nFeature_RNA", "nCount_RNA", "percent.mt")
#, ncol = 3
,group.by = NULL
)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
plot1 <- FeatureScatter(cf, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(cf, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")+geom_smooth(method = 'lm')
plot1 + plot2
## `geom_smooth()` using formula = 'y ~ x'
# data filtering -----------------
cf.1 <- subset(cf, subset = nFeature_RNA>200 & nFeature_RNA < 7000 & percent.mt <5)
# split object
cf.1[["RNA"]] <- split(cf.1[["RNA"]], f = cf.1$disease.ident)
# normalize data
cf.1 <- NormalizeData(cf.1)
## Normalizing layer: counts.CF
## Normalizing layer: counts.CTRL
# feature selection(find highly variable features)
cf.1 <- FindVariableFeatures(cf.1, selection.method = "vst", nfeatures = 3000)
## Finding variable features for layer counts.CF
## Finding variable features for layer counts.CTRL
#
# ## Identify the 10 most highly variable genes
# top10 <- head(VariableFeatures(cf.1), 10)
#
# ## plot variable features with and without labels
# plot1 <- VariableFeaturePlot(cf.1)
# plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
# plot1 + plot2
# scale to mean 0 and variance 1
all.genes <- rownames(cf.1)
cf.1<-ScaleData(cf.1, features=all.genes, verbose=F)
# regress out percentage number of mitochondrial genes
cf.1 <-ScaleData(cf.1, vars.to.regress = c("nFeature_RNA", "nCount_RNA","percent.mt"), do.scale=F, do.center=F, verbose=F)
## Warning: Different features in new layer data than already exists for
## scale.data
#PCA dimensionaily reduction on subset of scaled variable features
cf.1 <- RunPCA(cf.1, features = VariableFeatures(object = cf.1))
## PC_ 1
## Positive: PLEK, ACSL1, DOCK4, CSF3R, FOS, TNFRSF1B, SLC2A3, IFITM2, G0S2, CHST11
## SAMSN1, FPR1, C1QA, AC006369.1, LUCAT1, APOC1, C1QB, MARCO, PRKCB, PFKFB3
## OLR1, SLC25A37, C1QC, FABP4, HLA-DPA1, HLA-DPB1, HLA-DQA1, PROK2, FABP5, KCNJ15
## Negative: SPRR3, KRT13, LCN2, ECM1, MAL, LYPD2, PSCA, TMPRSS11B, EMP1, KRT19
## SPRR2A, KRT78, SCEL, PRSS27, MUC4, PRSS22, DUSP5, ELF3, NDRG2, AC118754.1
## TMPRSS2, PARD3, MUC21, PITX1, PPL, FAM129B, SPINK5, FBXO32, C6orf132, TMPRSS11E
## PC_ 2
## Positive: G0S2, BASP1, PTGS2, IFITM2, SLC25A37, RGS2, SAMSN1, CXCL8, PFKFB3, SLC2A3
## PRKCB, ZFP36, PROK2, LUCAT1, MXD1, CSF3R, ANTXR2, TNFRSF1B, ZFP36L1, MCTP2
## MARCKS, IER3, OSM, SIPA1L1, S100A12, PLEK, PLXNC1, FPR1, NFKBIA, IL1B
## Negative: FABP4, APOC1, C1QB, C1QA, APOE, MARCO, C1QC, SCD, NUPR1, FABP5
## LSAMP, PPARG, CD52, HLA-DQA1, CTSL, HLA-DPB1, HLA-DRA, FN1, CD74, ANO5
## TPRG1, HLA-DPA1, CCL18, HLA-DRB1, TXNIP, AL365295.1, RBP4, AC092691.1, ANXA1, LYZ
## PC_ 3
## Positive: PROK2, MCTP2, ACSL1, ALPL, DYSF, FCGR3B, RUBCNL, S100A8, S100A9, KCNJ15
## PFKFB3, SORL1, IFRD1, SELL, CREB5, ADGRG3, CMTM2, AL049651.1, FABP4, CXCR2
## OSM, SLC25A37, IL1RAP, FPR1, LSAMP, PTGS2, AC105402.3, AL034397.3, S100A12, FPR2
## Negative: C15orf48, CCL3L1, FNIP2, CCL3, CCL4L2, CCL4, VEGFA, JARID2, SERPINB9, HIF1A-AS2
## IER3, CXCR4, GBE1, HES4, SLAMF7, AC025580.2, TLR2, CXCL2, TNFAIP3, TIMP1
## TRAF1, DOCK4, PID1, ICAM1, EREG, GPR183, KYNU, NFKBIA, AZIN1-AS1, VCAN
## PC_ 4
## Positive: LYZ, IL1B, TIMP1, OLR1, IL1RN, S100A8, S100A9, ANXA1, PTGS2, S100A12
## HLA-DRA, CTSL, AC020656.1, CCL3, ACSL1, KYNU, RPLP1, FCN1, HSPA1A, SLC2A3
## HSP90AA1, VCAN, CD74, EHD1, RPS12, SOCS3, HLA-DRB1, EREG, CXCL2, IER3
## Negative: ZEB1, AC006369.1, KATNBL1, PRKCB, CXCR4, PLAU, AC099489.1, RASSF2, PLPP3, CD22
## GBE1, CHST11, EPHB1, INPP5A, HIF1A-AS2, RIPOR2, SLC38A1, GLT1D1, SULF2, PPP1R16B
## KIAA0319, RHOH, CARD11, BANK1, SNAPC1, SPTBN1, IL1R1, RNF103-CHMP3, PLXNC1, ST6GAL1
## PC_ 5
## Positive: AC090498.1, GPR183, RPS12, HLA-DRA, SERPINB9, CD74, RPLP1, HLA-DRB1, RFTN1, MEF2C
## HLA-DPB1, HLA-DPA1, ST8SIA4, TCF4, BCL2, ARID5B, HSP90AA1, ADAM19, RUNX3, PPP1R16B
## SAMSN1, PDE4D, MTSS1, SIPA1L1, XYLT1, HLA-DQA1, HSPA1A, SLC2A3, ST6GAL1, ENTPD1
## Negative: CCL3L1, CCL4L2, AC006369.1, PLAU, CXCL8, CCL4, CCL3, HCAR3, KATNBL1, MAFF
## LUCAT1, FOS, GBE1, CXCL2, NFKBIA, IER3, PLPP3, AC005050.3, HCAR2, IER2
## AC099489.1, TLR2, DOCK4, TNFAIP3, AC023157.3, MXD1, TNFAIP6, ZFP36, LSAMP, PI3
#visualize reduced dimensions
VizDimLoadings(cf.1, dims = 1:10, reduction = "pca")
#dimension heat map(first dimension- primary sources for heterogen.)
#cells and features ordered by PCA scores
#DimHeatmap(cf.1, dims = 1:20, cells = 500, balanced = TRUE)
#Determine "dimensionality" of dataset (paper used 12)
ElbowPlot(cf.1, ndims=20, reduction="pca")
We use rpca here after comparing our results with CCA and Harmony.
# Perform streamlined (one-line) integrative analysis--------------------
CF_integration.1 <- IntegrateLayers(
object = cf.1, method = RPCAIntegration,
orig.reduction = "pca", new.reduction = 'integrated.rpca',
verbose = FALSE)
CF_integration.1 <- FindNeighbors(CF_integration.1, reduction = "integrated.rpca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
CF_integration.1 <- FindClusters(CF_integration.1, resolution =c(0.1,0.2,0.3,0.5), cluster.name = c("rpca_clusters_0.1","rpca_clusters_0.2","rpca_clusters_0.3","rpca_clusters_0.5"))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 20495
## Number of edges: 669914
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9648
## Number of communities: 6
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 20495
## Number of edges: 669914
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9418
## Number of communities: 7
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 20495
## Number of edges: 669914
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9252
## Number of communities: 9
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 20495
## Number of edges: 669914
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9018
## Number of communities: 14
## Elapsed time: 3 seconds
CF_integration.1 <- RunUMAP(CF_integration.1, reduction = "integrated.rpca", dims = 1:15, reduction.name = "umap.rpca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 15:18:53 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:18:53 Read 20495 rows and found 15 numeric columns
## 15:18:53 Using Annoy for neighbor search, n_neighbors = 30
## 15:18:53 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:18:56 Writing NN index file to temp file /var/folders/0r/r2dplk4d72d437ygr7ycq8kw0000gn/T//RtmpX7AYON/file612c1e077b0b
## 15:18:56 Searching Annoy index using 1 thread, search_k = 3000
## 15:19:02 Annoy recall = 100%
## 15:19:04 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 15:19:05 Initializing from normalized Laplacian + noise (using RSpectra)
## 15:19:06 Commencing optimization for 200 epochs, with 851070 positive edges
## 15:19:17 Optimization finished
DimPlot(CF_integration.1, reduction = "umap.rpca"
, group.by = c("rpca_clusters_0.1","rpca_clusters_0.2"
,"rpca_clusters_0.3","rpca_clusters_0.5")
,label = T)
#
# DimPlot(CF_integration.1, reduction = "umap.unintegrated"
# , group.by = c("rpca_clusters_0.1","rpca_clusters_0.2"
# ,"rpca_clusters_0.3","rpca_clusters_0.5")
# ,label = T)
# findConserved markers (resolution=0.1)--------------------
CF_integration.1 <- JoinLayers(CF_integration.1)
Idents(CF_integration.1) <- CF_integration.1@meta.data$rpca_clusters_0.1
# cluster0
marker_r1_0 <- FindConservedMarkers(CF_integration.1, ident.1 = 0, grouping.var = "disease.ident", verbose = FALSE)
head(marker_r1_0,20)
#FCGR3B,CXCR2,ALPL -- PMN by paper
VlnPlot(CF_integration.1, features=c("CXCR4", "IGF2R","ALPL"))
VlnPlot(CF_integration.1, features=c("FCGR3B", "CXCR2","ALPL"))
# MRC1 or MARCO typically detected on alvMΦs
VlnPlot(CF_integration.1, features=c("MRC1", "MARCO"))
After several searches, we name each cluster and plot the named clusters
as below:
CF_integration.1 <- RenameIdents(CF_integration.1, `0` = 'PMN')
CF_integration.1 <- RenameIdents(CF_integration.1, `1` = 'AM2')
CF_integration.1 <- RenameIdents(CF_integration.1, `2` = 'M/Momo')#
CF_integration.1 <- RenameIdents(CF_integration.1, `3` = 'epithelial')
CF_integration.1 <- RenameIdents(CF_integration.1, `4` = 'T/B')
CF_integration.1 <- RenameIdents(CF_integration.1, `5` = 'AM1')
# plot_clustering1 <- DimPlot(CF_integration.1,reduction = c("umap.unintegrated")
# , label.size = 4,label = T)
plot_clustering2 <- DimPlot(CF_integration.1,reduction = c("umap.rpca")
, label.size = 4,label = T)
# plot_clustering1|
plot_clustering2
DimPlot(CF_integration.1,reduction = c("umap.rpca")
,split.by = 'disease.ident',, label.size = 4,label = T)
# findMarkers between conditions ---------------------
CF_integration.1$celltype.cnd <- paste0(CF_integration.1$rpca_clusters_0.1,'_', CF_integration.1$disease.ident)
#View(CF_integration.1@meta.data)
Idents(CF_integration.1) <- CF_integration.1$celltype.cnd
DimPlot(CF_integration.1, reduction = 'umap.rpca', label = TRUE)
# find markers
b.interferon.response <- FindMarkers(CF_integration.1, ident.1 = '1_CF', ident.2 = '1_CTRL')
head(b.interferon.response)
# plotting conserved features vs DE features between conditions
#head(marker_r1_1)
FeaturePlot(CF_integration.1, features = c('G0S2', 'PRKCB', 'ANO5','AC026369.3'), split.by = 'disease.ident', min.cutoff = 'q10')